Import and process scone evaluation

Note that the scone scores have already been multiplied by (+/-)1 such that a high score corresponds to a good normalization.

sconeResultFiles <- list.files("../../objects",  
                               pattern = "*scores.rds$", full.names = TRUE)
sconeResultFilesShort <- list.files("../../objects",  
                               pattern = "*scores.rds$")
datasets <- unlist(lapply(strsplit(sconeResultFilesShort, split="_"),"[[",1))
datasets[datasets %in% "geschwind2018"] <- "torre-ubieta2018"
scores <- sapply(1:length(sconeResultFiles), function(ii) readRDS(sconeResultFiles[ii]))
names(scores) <- datasets
# remove other methods
rmMethods <- c("gcqn_mean", "gcqn_median_10", "gcqn_median_20",  "gcqn_median_100",
               "gcqn_median_permuted")
scores <- lapply(scores, function(scoreMat){
  scoreNames <- rownames(scoreMat)
  rmID <- unlist(sapply(rmMethods, function(method){
    grep(x=scoreNames, pattern=method)
  }))
  return(scoreMat[-rmID,])
})
# remove unused metrics
rmMetrics <- c("RLE_MED", "RLE_IQR", "rleGC_iqr")
scores <- lapply(scores, function(scoreMat){
  scoreColNames <- colnames(scoreMat)
  rmID <- unlist(sapply(rmMetrics, function(metric){
    grep(x=scoreColNames, pattern=metric)
  }))
  return(scoreMat[,-rmID])
})
# focus on no_uv
keepUV <- "no_uv"
scores <- lapply(scores, function(scoreMat){
  scoreNames <- rownames(scoreMat)
  keepID <- grep(x=scoreNames, pattern=keepUV)
  return(scoreMat[keepID,])
})
# rank scores: high rank is good score according to that metric.
rankedScores <- lapply(scores, function(scoreMat){
  apply(scoreMat,2,rank)
})

Import and process simulation evaluation

simulationResultFiles <- list.files("../../objects",
                          pattern = "simulationResults", full.names = TRUE)
simulationResultFilesShort <- list.files("../../objects",  
                               pattern = "simulationResults")
datasets <- unlist(lapply(strsplit(simulationResultFilesShort, split="_"),"[[",1))
datasets[datasets %in% "geschwind2018"] <- "torre-ubieta2018"
simulationScores <- sapply(1:length(simulationResultFiles), function(ii){
  readRDS(simulationResultFiles[ii])
}, simplify = FALSE)
names(simulationScores) <- datasets
# extract cbd
cbdList <- list()
for(kk in 1:length(simulationScores)){
  cbdList[[kk]] <- simulationScores[[kk]][[2]]$cbd
}
names(cbdList) <- datasets
# remove unused metrics
rmMetricsSim <- c("aufdptpr", "cbd")
simulationScores <- lapply(simulationScores, function(scoreList){
  scoreList[[2]] <- scoreList[[2]][!names(scoreList[[2]]) %in% rmMetricsSim]
  return(scoreList)
})
# high is good
simulationScores <- lapply(simulationScores, function(simScoreList){
  simScoreList[[1]]$fpr <- -abs(simScoreList[[1]]$fpr - 0.05)
  simScoreList[[1]]$distUnifPvalMock <- -simScoreList[[1]]$distUnifPvalMock
  simScoreList[[1]]$helDistVarGC <- -simScoreList[[1]]$helDistVarGC
  simScoreList[[2]]$DAGCDistDiff <- -simScoreList[[2]]$DAGCDistDiff
  return(simScoreList)
})
simulationScoreMats <- lapply(simulationScores, function(simScoreList){
  oneList <- c(simScoreList[[1]], simScoreList[[2]])
  # ensure same order
  nameOrder <- names(oneList[[1]])
  oneList <- lapply(oneList, function(x) x[nameOrder])
  scoreMat <- t(do.call(rbind, oneList))
  return(scoreMat)
})
simulationScoreRanks <- lapply(simulationScoreMats, function(scoreMat){
  apply(scoreMat,2,rank)
})

Merge scores

rankedScores <- lapply(rankedScores, function(scoreMat){
  rownames(scoreMat) <- unlist(lapply(strsplit(rownames(scoreMat), split=","), "[[", 2))
  rownames(scoreMat)[rownames(scoreMat) == "gcqn_median_50"] <- "gcqn"
  rownames(scoreMat)[rownames(scoreMat) == "fq"] <- "FQ"
  if("cqn_length" %in% rownames(scoreMat)){
    scoreMat <- scoreMat[!rownames(scoreMat) == "cqn",]
    rownames(scoreMat)[rownames(scoreMat) == "cqn_length"] <- "cqn"
  }
  rownames(scoreMat)[rownames(scoreMat) == "tmm"] <- "TMM"
  rownames(scoreMat)[rownames(scoreMat) == "deseq"] <- "DESeq2"
  return(scoreMat)
})

# available datasets
dats <- names(simulationScoreRanks)
allScoreRanks <- list()
for(dd in 1:length(dats)){
  sconeScores <- rankedScores[[dats[dd]]]
  simScores <- simulationScoreRanks[[dats[dd]]]
  sconeScores <- sconeScores[rownames(simScores),]
  allScores <- cbind(sconeScores, simScores)
  allScoreRanks[[dd]] <- allScores
}

for(dd in 1:length(allScoreRanks)){
  curScores <- allScoreRanks[[dd]]
  rn <- rownames(curScores)
  rn[rn == "gcqn"] <- "GC-FQ"
  rn[rn == "edaseq"] <- "FQ-FQ"
  rn[rn == "gcqn_smooth"] <- "GC-FQ_smooth"
  rn[rn == "uq"] <- "UQ"
  rn[rn == "none"] <- "None"
  rn[rn == "sum"] <- "Sum"
  rownames(curScores) <- rn
  allScoreRanks[[dd]] <- curScores
}

Performance visualization

For each dataset

library(ggplot2)
gcNormMethods <- c("cqn", "FQ-FQ", "GC-FQ", "GC-FQ_smooth")
pDat <- list()
for(dd in 1:length(dats)){
  curScores <- allScoreRanks[[dd]]
  curScores <- curScores[order(rowMeans(curScores)),]
  df <- data.frame(rank = c(curScores),
                   method = rep(rownames(curScores), ncol(curScores)),
                   metric = rep(colnames(curScores), each=nrow(curScores)))
  df$method <- factor(df$method, levels = rownames(curScores))
  # introduce class on what metrics assess
  df$class <- NA
  df$class[df$metric %in% c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_UV_COR")] <- "norm"
  df$class[df$metric %in% c("rleGC_med", "helDistVarGC", "DAGCDistDiff")] <- "GC"
  df$class[df$metric %in% c("fpr", "distUnifPvalMock", "auroc", "aufdptpr")] <- "DAA"
  # shorter name for metrics
  df$metric <- as.character(df$metric)
  df$metric[df$metric == "BATCH_SIL"] <- "Batch Sil"
  df$metric[df$metric == "BIO_SIL"] <- "Bio Sil"
  df$metric[df$metric == "DAGCDistDiff"] <- "GC-dist DA"
  df$metric[df$metric == "distUnifPvalMock"] <- "p-val unif"
  df$metric[df$metric == "EXP_QC_COR"] <- "QC cor"
  df$metric[df$metric == "EXP_UV_COR"] <- "UV cor"
  df$metric[df$metric == "helDistVarGC"] <- "p-val GC"
  df$metric[df$metric == "PAM_SIL"] <- "PAM Sil"
  df$metric[df$metric == "rleGC_med"] <- "RLE GC"
  df$dataset <- dats[dd]
  if(dd == 1){
    dfAll <- df
  } else {
    dfAll <- rbind(dfAll, df)
  }
  
  
  dfNorm <- df[df$class == "norm",]
  pNorm <- ggplot(dfNorm, aes(x=metric, y=method, fill=rank)) +
    geom_tile() +
    scale_fill_gradient2() +
  theme(legend.position = "none",
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9),
        axis.text.y = element_text(size=12, colour=ifelse(levels(df$method) %in% gcNormMethods,
                                                 "dodgerblue", "black")),
        axis.ticks.y = element_blank()) +
    scale_x_discrete(position = "top") +
    ylab(NULL) +
    xlab("Normalization \n performance") #+
    # annotate("text", x = -0.2, y = 12, label = dats[dd], size=6) +
    # coord_cartesian(ylim = c(0,10), xlim=c(1,length(unique(dfNorm$metric))), clip = "off")
    
  dfDA <- df[df$class == "DAA",]
  pDA <- ggplot(dfDA, aes(x=metric, y=method, fill=rank)) +
    geom_tile() +
    scale_fill_gradient2(low = "skyblue", high="steelblue") +
    theme(legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9)) +
    scale_x_discrete(position = "top") +
    xlab("Differential analysis \n performance") +
    ylab(NULL)
  
  dfGC <- df[df$class == "GC",]
  pGC <- ggplot(dfGC, aes(x=metric, y=method, fill=rank)) +
    geom_tile() +
    scale_fill_gradient2(low = "darkseagreen2", high="darkseagreen4") +
    theme(legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9)) +
    scale_x_discrete(position = "top") +
    xlab("GC-content bias \n removal") +
    ylab(NULL)
  
  pDat[[dd]] <- cowplot::plot_grid(pNorm, pDA, pGC,
                     nrow = 1,
                     rel_widths = c(1.5, 1, .85))
}
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
cowplot::plot_grid(plotlist = pDat,
                   nrow = 4, ncol = 2,
                   labels=dats,
                   label_size = 11,
                   label_x = -0.025)

ggsave("~/Dropbox/research/atacseq/bulk/plots/totalBenchmarkPerDataset.pdf",
       height = 14, width = 12)

Across all datasets

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.1.6     ✓ dplyr   1.0.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ✓ purrr   0.3.3
## ── Conflicts ───────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# get order across all
avNormRanks <- dfAll %>%
  group_by(method) %>%
  summarize(avRank = mean(rank, na.rm=TRUE))
orderedMethods <- as.character(avNormRanks$method[order(avNormRanks$avRank, decreasing=FALSE)])
gcNormMethods <- c("cqn", "FQ-FQ", "GC-FQ", "GC-FQ_smooth")

dfAllNorm <- dfAll[dfAll$class == "norm",]
dfAllNormSummary <- dfAllNorm %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))
dfAllNormSummaryNorm <- dfAllNormSummary

pNormAll <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2() +
  theme(legend.position = "none",
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
        axis.text.y = element_text(size = 14, 
                                   colour=ifelse(orderedMethods %in% gcNormMethods,
                                                 "dodgerblue", "black"))) +
    scale_x_discrete(position = "top") +
    ylab(NULL) +
    xlab("Normalization \n performance")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## Differential accessibility analysis
dfAllNorm <- dfAll[dfAll$class == "DAA",]
dfAllNormSummary <- dfAllNorm %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))
dfAllNormSummaryDA <- dfAllNormSummary

pDAAll <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2(low = "skyblue", high="steelblue") +
    theme(legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35)) +
    scale_x_discrete(position = "top") +
    xlab("Differential analysis \n performance") +
    ylab(NULL)


## GC-content bias removal
dfAllNorm <- dfAll[dfAll$class == "GC",]
dfAllNormSummary <- dfAllNorm %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
# get order
avNormRanks <- dfAllNormSummary %>%
  group_by(method) %>%
  summarize(avRank = mean(meanRank))
orderedMethods <- as.character(avNormRanks$method[order(avNormRanks$avRank, decreasing=FALSE)])
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))
dfAllNormSummaryGC <- dfAllNormSummary

pGCAll <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2(low = "darkseagreen2", high="darkseagreen4") +
    theme(legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35)) +
    scale_x_discrete(position = "top") +
    xlab("GC-content bias \n removal") +
    ylab(NULL)

cowplot::plot_grid(pNormAll, pDAAll, pGCAll,
                     nrow = 1,
                     rel_widths = c(1.5, 1, 1))

ggsave("~/Dropbox/research/atacseq/bulk/plots/totalBenchmarkSummary.pdf",
       height = 6, width = 12)

library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
imgFile <- "/Users/koenvandenberge/Dropbox/research/atacseq/bulk/plots/benchmarkSchemev1.png"
pScheme <- ggdraw() +
            draw_image(imgFile,
                       scale=1)


pHeat <- cowplot::plot_grid(pNormAll, pDAAll, pGCAll,
                     nrow = 1,
                     rel_widths = c(1.5, 1, 1))

cowplot::plot_grid(pScheme,
                   pHeat,
                   nrow=2,
                   labels=letters[1:2],
                   rel_heights = c(3/4, 1))

ggsave("~/Dropbox/research/atacseq/bulk/plots/totalBenchmarkSummary_scheme.pdf",
       height = 15, width = 12)

Include global overview

dfAllSummary <- rbind(dfAllNormSummaryNorm,
                      dfAllNormSummaryDA,
                      dfAllNormSummaryGC)
dfAllSummSumm <- dfAllSummary %>% 
  group_by(method) %>%
  summarize(meanmeanRank=mean(meanRank))
dfAllSummSumm$global <- factor(rep("global", nrow(dfAllSummSumm)))

pGlobal <- ggplot(dfAllSummSumm, aes(x=global, y=method, fill=meanmeanRank)) +
    geom_tile() +
    scale_fill_gradient2(low = "darkgoldenrod1", high="darkgoldenrod3") +
    theme(legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35)) +
    scale_x_discrete(position = "top") +
    xlab("GC-content bias \n removal") +
    ylab(NULL)


cowplot::plot_grid(pNormAll, pDAAll, pGCAll, pGlobal,
                     nrow = 1,
                     rel_widths = c(1.5, 1, 1, .2))

Across all datasets, no GC metrics

library(tidyverse)
# get order across all
dfAll_noGC <- dfAll[!dfAll$class == "GC",]
avNormRanks <- dfAll_noGC %>%
  group_by(method) %>%
  summarize(avRank = mean(rank, na.rm=TRUE))
orderedMethods <- as.character(avNormRanks$method[order(avNormRanks$avRank, decreasing=FALSE)])
gcNormMethods <- c("cqn", "FQ-FQ", "GC-FQ", "GC-FQ_smooth")

dfAllNorm <- dfAll_noGC[dfAll_noGC$class == "norm",]
dfAllNormSummary <- dfAllNorm %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))

pNormAll <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2() +
  theme(legend.position = "none",
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
        axis.text.y = element_text(size = 14, 
                                   colour=ifelse(orderedMethods %in% gcNormMethods,
                                                 "dodgerblue", "black"))) +
    scale_x_discrete(position = "top") +
    ylab(NULL) +
    xlab("Normalization \n performance")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## Differential accessibility analysis
dfAllNorm <- dfAll_noGC[dfAll_noGC$class == "DAA",]
dfAllNormSummary <- dfAllNorm %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))

pDAAll <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2(low = "skyblue", high="steelblue") +
    theme(legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35)) +
    scale_x_discrete(position = "top") +
    xlab("Differential analysis \n performance") +
    ylab(NULL)



cowplot::plot_grid(pNormAll, pDAAll,
                     nrow = 1,
                     rel_widths = c(1.5, 1))

ggsave("~/Dropbox/research/atacseq/bulk/plots/totalBenchmarkSummary_noGC.pdf",
       height = 6, width = 10)

For each benchmark component separately

Normalization

# get order across all
dfAllOnlyNorm <- dfAll[dfAll$class == "norm",]
avNormRanks <- dfAllOnlyNorm %>%
  group_by(method) %>%
  summarize(avRank = mean(rank, na.rm=TRUE))
orderedMethods <- as.character(avNormRanks$method[order(avNormRanks$avRank, decreasing=FALSE)])

dfAllNormSummary <- dfAllOnlyNorm %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))

pNormOnly <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2() +
  theme(legend.position = "none",
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
        axis.text.y = element_text(size = 14, 
                                   colour=ifelse(orderedMethods %in% gcNormMethods,
                                                 "dodgerblue", "black"))) +
    scale_x_discrete(position = "top") +
    ylab(NULL) +
    xlab("Normalization \n performance")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
pNormOnly

Differential accessibility analysis

# get order across all
dfAllOnlyDA <- dfAll[dfAll$class == "DAA",]
avNormRanks <- dfAllOnlyDA %>%
  group_by(method) %>%
  summarize(avRank = mean(rank, na.rm=TRUE))
orderedMethods <- as.character(avNormRanks$method[order(avNormRanks$avRank, decreasing=FALSE)])

dfAllNormSummary <- dfAllOnlyDA %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))

pDAOnly <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2(low = "skyblue", high="steelblue") +
  theme(legend.position = "none",
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
        axis.text.y = element_text(size = 14, 
                                   colour=ifelse(orderedMethods %in% gcNormMethods,
                                                 "dodgerblue", "black"))) +
    scale_x_discrete(position = "top") +
    ylab(NULL) +
    xlab("Differential analysis \n performance")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
pDAOnly

GC-content bias removal

# get order across all
dfAllOnlyGC <- dfAll[dfAll$class == "GC",]
avNormRanks <- dfAllOnlyGC %>%
  group_by(method) %>%
  summarize(avRank = mean(rank, na.rm=TRUE))
orderedMethods <- as.character(avNormRanks$method[order(avNormRanks$avRank, decreasing=FALSE)])

dfAllNormSummary <- dfAllOnlyGC %>% 
  group_by(dataset, method) %>%
  summarize(meanRank = mean(rank))
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethods)
dfAllNormSummary$dataset <- factor(dfAllNormSummary$dataset,
                                   levels = c("bryois2018", "calderon2019", "fullard2019",
                                              "murphy2019","torre-ubieta2018",
                                              "philip2017", "rizzardi2019",
                                               "liu2019"))

pGCOnly <- ggplot(dfAllNormSummary, aes(x=dataset, y=method, fill=meanRank)) +
    geom_tile() +
    scale_fill_gradient2(low = "darkseagreen2", high="darkseagreen4") +
  theme(legend.position = "none",
          panel.background = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
        axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
        axis.text.y = element_text(size = 14, 
                                   colour=ifelse(orderedMethods %in% gcNormMethods,
                                                 "dodgerblue", "black"))) +
    scale_x_discrete(position = "top") +
    ylab(NULL) +
    xlab("GC-content bias \n removal")
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
pGCOnly

cowplot::plot_grid(pNormOnly,
                   pDAOnly,
                   pGCOnly,
                   labels=letters[1:3],
                   nrow = 1)

ggsave("~/Dropbox/research/atacseq/bulk/plots/totalBenchmark_perComponent.pdf",
       width = 12, height = 6)

All measures for all datasets separately

dfAllOnlyNorm <- dfAll[dfAll$class == "norm",]
dfAllOnlyDA <- dfAll[dfAll$class == "DAA",]
dfAllOnlyGC <- dfAll[dfAll$class == "GC",]
pDatList <- list()


pDatList <- lapply(1:8, function(dd){ #loop across datasets
  dfAllOnlyNormDat <- dfAllOnlyNorm[dfAllOnlyNorm$dataset == unique(dfAllOnlyNorm$dataset)[dd],]
  avNormRanksDat <- dfAllOnlyNormDat %>%
    group_by(method) %>%
    summarize(avRank = mean(rank, na.rm=TRUE))
  orderedMethodsDat <- as.character(avNormRanksDat$method[order(avNormRanksDat$avRank, decreasing=FALSE)])
  
  dfAllNormSummary <- dfAllOnlyNormDat %>% 
    group_by(dataset, method) %>%
    summarize(meanRank = mean(rank))
  dfAllNormSummary$method <- factor(dfAllNormSummary$method, levels=orderedMethodsDat)
  dfAllOnlyNormDat$method <- factor(dfAllOnlyNormDat$method, levels=orderedMethodsDat)

  
  pNormOnlyDat <- ggplot(dfAllOnlyNormDat, aes(x=metric, y=method, fill=rank)) +
      geom_tile() +
      scale_fill_gradient2() +
    theme(legend.position = "none",
            panel.background = element_blank(),
            panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
          axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
          axis.text.y = element_text(size = 14, 
                                     colour=ifelse(orderedMethodsDat %in% gcNormMethods,
                                                   "dodgerblue", "black"))) +
      scale_x_discrete(position = "top") +
      ylab(NULL) +
      xlab("Normalization \n performance")

  ## DAA
  dfAllOnlyDADat <- dfAllOnlyDA[dfAllOnlyDA$dataset == unique(dfAllOnlyDA$dataset)[dd],]
  avDARanksDat <- dfAllOnlyDADat %>%
    group_by(method) %>%
    summarize(avRank = mean(rank, na.rm=TRUE))
  orderedMethodsDat <- as.character(avDARanksDat$method[order(avDARanksDat$avRank, decreasing=FALSE)])
  
  dfAllDASummary <- dfAllOnlyDADat %>% 
    group_by(dataset, method) %>%
    summarize(meanRank = mean(rank))
  dfAllDASummary$method <- factor(dfAllDASummary$method, levels=orderedMethodsDat)
  dfAllOnlyDADat$method <- factor(dfAllOnlyDADat$method, levels=orderedMethodsDat)

  pDAOnlyDat <- ggplot(dfAllOnlyDADat, aes(x=metric, y=method, fill=rank)) +
      geom_tile() +
    scale_fill_gradient2(low = "skyblue", high="steelblue") +
    theme(legend.position = "none",
            panel.background = element_blank(),
            panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
          axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
          axis.text.y = element_text(size = 14, 
                                     colour=ifelse(orderedMethodsDat %in% gcNormMethods,
                                                   "dodgerblue", "black"))) +
      scale_x_discrete(position = "top") +
      ylab(NULL) +
      xlab("Differential analysis\nperformance")

  
  # GC-content
  dfAllOnlyGCDat <- dfAllOnlyGC[dfAllOnlyGC$dataset == unique(dfAllOnlyGC$dataset)[dd],]
  avGCRanksDat <- dfAllOnlyGCDat %>%
    group_by(method) %>%
    summarize(avRank = mean(rank, na.rm=TRUE))
  orderedMethodsDat <- as.character(avGCRanksDat$method[order(avGCRanksDat$avRank, decreasing=FALSE)])
  
  dfAllGCSummary <- dfAllOnlyGCDat %>% 
    group_by(dataset, method) %>%
    summarize(meanRank = mean(rank))
  dfAllGCSummary$method <- factor(dfAllGCSummary$method, levels=orderedMethodsDat)
  dfAllOnlyGCDat$method <- factor(dfAllOnlyGCDat$method, levels=orderedMethodsDat)

  pGCOnlyDat <- ggplot(dfAllOnlyGCDat, aes(x=metric, y=method, fill=rank)) +
      geom_tile() +
    scale_fill_gradient2(low = "darkseagreen2", high="darkseagreen4") +
    theme(legend.position = "none",
            panel.background = element_blank(),
            panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
          axis.text.x = element_text(angle = 25, size=9, vjust=.8, hjust=.35),
          axis.text.y = element_text(size = 14, 
                                     colour=ifelse(orderedMethodsDat %in% gcNormMethods,
                                                   "dodgerblue", "black"))) +
      scale_x_discrete(position = "top") +
      ylab(NULL) +
      xlab("GC-content bias \n removal")

  pDat <- cowplot::plot_grid(pNormOnlyDat,
                             pDAOnlyDat,
                             pGCOnlyDat,
                             nrow = 1, ncol=3)
  curDataset <- datasets[dd]
  substr(curDataset,1,1) <- toupper(substr(curDataset,1,1))
  pDat <- pDat + geom_text(mapping = aes(x=0.08,y=.95, label=curDataset),
                   fontface="bold")
  
  return(pDat)
})
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
## `summarise()` has grouped output by 'dataset'. You can override using the `.groups` argument.
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
cowplot::plot_grid(plotlist=pDatList[1:4], nrow=4, ncol=1)

ggsave("../../../../plots/benchmark_allMeasures_perDataset1.pdf",
       width=8, height=10)

cowplot::plot_grid(plotlist=pDatList[5:8], nrow=4, ncol=1)

ggsave("../../../../plots/benchmark_allMeasures_perDataset2.pdf",
       width=8, height=10)

Concordance analysis on DA peaks

calcJaccard <- function(x, y){
  sum(which(x) %in% which(y)) / length(unique(c(which(x), which(y))))
}
pairwiseJaccard <- function(mat){
  combs <- combn(x=1:ncol(mat), m=2)
  jacMat <- matrix(NA, nrow=ncol(mat), ncol=ncol(mat))
  rownames(jacMat) <- colnames(jacMat) <- colnames(mat)
  for(cc in 1:ncol(combs)){
    jacMat[combs[1,cc], combs[2,cc]] <- calcJaccard(curde[,combs[1,cc]],
                                                    curde[,combs[2,cc]])
  }
  # make symmetric
  diag(jacMat) <- NA
  jacMat[lower.tri(jacMat)] <- t(jacMat)[lower.tri(jacMat)]
  return(jacMat)
}

jacMatList <- list()
curdeList <- list()
library(iCOBRA)
## This version of bslib is designed to work with shiny version 1.5.0.9007 or higher.
for(kk in 1:length(cbdList)){
  curpadj <- apply(pval(cbdList[[kk]]), 2, p.adjust, method="fdr")
  curpadj[is.na(curpadj)] <- 1
  curde <- curpadj <= 0.05
  curdeList[[kk]] <- curde
  curJacMat <- pairwiseJaccard(curde)
  rn <- rownames(curJacMat)
  rn[rn == "gcqn"] <- "GC-FQ"
  rn[rn == "edaseq"] <- "FQ-FQ"
  rn[rn == "gcqn_smooth"] <- "GC-FQ_smooth"
  rn[rn == "uq"] <- "UQ"
  rn[rn == "none"] <- "None"
  rn[rn == "sum"] <- "Sum"
  rownames(curJacMat) <- colnames(curJacMat) <- rn
  jacMatList[[kk]] <- curJacMat
}

## average jaccard
avJaccard <- Reduce('+', jacMatList) / length(jacMatList)
library(pheatmap)
diag(avJaccard) <- NA
pheatmap(avJaccard, border=NA)

phavjac <- pheatmap(avJaccard, border=NA, fontsize=18)

Concordance analysis on clustering

changeNames <- function(x){
  names(x) <- unlist(lapply(strsplit(names(x), split=","), "[[", 2))
  names(x)[names(x) == "gcqn_median_50"] <- "GC-FQ"
  names(x)[names(x) == "gcqn_smooth"] <- "GC-FQ_smooth"
  names(x)[names(x) == "edaseq"] <- "FQ-FQ"
  names(x)[names(x) == "fq"] <- "FQ"
  if("cqn_length" %in% names(x)){
    x <- x[!names(x) == "cqn"]
    names(x)[names(x) == "cqn_length"] <- "cqn"
  }
  names(x)[names(x) == "tmm"] <- "TMM"
  names(x)[names(x) == "deseq"] <- "DESeq2"
  names(x)[names(x) == "uq"] <- "UQ"
  names(x)[names(x) == "none"] <- "None"
  names(x)[names(x) == "sum"] <- "Sum"
  return(x)
}

bioSil <- lapply(scores, function(x) x[,"BIO_SIL"]) 
bioSil <- lapply(bioSil, changeNames)
bioSil <- lapply(bioSil, function(x) x[order(names(x))])
bioSil <- lapply(bioSil, function(x) x - mean(x))
bioSilMat <- do.call(rbind, bioSil)
# avSil <- Reduce('+', bioSil) / length(bioSil)
par(bty='l')
boxplot(bioSilMat, ylab="", ylab="")
## Warning in bxp(list(stats = structure(c(-0.0469545675699299,
## -0.0079496414579699, : Duplicated argument ylab = "" is disregarded
abline(h=0, col="red", lty=2)

library(ggplot2)
bioSilMat <- bioSilMat[,order(apply(bioSilMat,2,median), decreasing=TRUE)]
orderedMethods <- colnames(bioSilMat)
dfLong <- tidyr::gather(as.data.frame(bioSilMat))
colnames(dfLong) <- c("method", "value")
dfLong$method <- factor(dfLong$method, levels=colnames(bioSilMat))
pSil <- ggplot(dfLong, aes(x=method, y=value)) +
  theme_classic() + 
  geom_boxplot() +
  geom_hline(yintercept = 0, col="red", lty=2) +
  ylab("Deviation from average silhouette") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 360-25, size=18,
                                   color=ifelse(orderedMethods %in% gcNormMethods,
                                                 "dodgerblue", "black")),
        axis.text.y = element_text(size=18),
        axis.title.y = element_text(size=18))
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
pSil

cowplot::plot_grid(phavjac[[4]], pSil, labels=letters[1:2],
                   rel_widths = c(2/3, 1/3))

ggsave("~/Dropbox/research/atacseq/bulk/plots/totalBenchmark_concordance.pdf",
       height = 10, width = 20)

Interaction of variable peak width with GC-bias

Are datasets that use a constant peak width different as compared to datasets using variable peak widths with respect to sample-specific GC-content bias? Note we need to switch the sign again as these were all metrics where a high value was bad.

medianPeakWidths <- readRDS("../../data/medianPeakWidths.rds")

## Hellinger distance with uniform distribution across GC bins for mock
helDistList <- lapply(simulationScores, function(x) x[[1]]$helDistVarGC)
tmmHelDist <- unlist(lapply(helDistList, function(x) x[names(x) == "TMM"]))
df <- data.frame(helDist = -tmmHelDist,
                 dataset = names(helDistList),
                 peakSize = "variable")
df$peakSize <- as.character(df$peakSize)
df$peakSize[df$dataset %in% c("bryois2018", "murphy2019")] <- "constant"
df$peakSize <- factor(df$peakSize)
df$medianWidth <- c(301, 492, 385, 980, 485, 201, 458, 500)

pHelUnif <- ggplot(df, aes(x=peakSize, y=helDist, col=dataset)) +
  geom_point() +
  theme_classic() +
  ggtitle("P-val uniform wrt GC") +
  theme(legend.position = "none") +
  xlab("Peak width") +
  ylab("Hellinger distance")

pHelUnifWidth <- ggplot(df, aes(x=medianWidth, y=helDist, col=dataset)) +
  geom_point() +
  theme_classic() +
  ggtitle("P-val uniform wrt GC") +
  theme(legend.position = "none") +
  xlab("Median peak width") +
  ylab("Hellinger distance")


## Hellinger distance with true GC distribution of DA peaks
helDistList <- lapply(simulationScores, function(x) x[[2]]$DAGCDistDiff)
tmmHelDist <- unlist(lapply(helDistList, function(x) x[names(x) == "TMM"]))
df <- data.frame(helDist = -tmmHelDist,
                 dataset = names(helDistList),
                 peakSize = "variable")
df$peakSize <- as.character(df$peakSize)
df$peakSize[df$dataset %in% c("bryois2018", "murphy2019")] <- "constant"
df$peakSize <- factor(df$peakSize)
df$medianWidth <- c(301, 492, 385, 980, 485, 201, 458, 500)


pHelDA <- ggplot(df, aes(x=peakSize, y=helDist, col=dataset)) +
  geom_point() +
  theme_classic() +
  ggtitle("GC dist of DA peaks") +
  theme(legend.position = "none") +
  xlab("Peak width") +
  ylab("Cumulative distance")

pHelDAWidth <- ggplot(df, aes(x=medianWidth, y=helDist, col=dataset)) +
  geom_point() +
  theme_classic() +
  ggtitle("GC dist of DA peaks") +
  theme(legend.position = "none") +
  xlab("Median peak width") +
  ylab("Cumulative distance")


## RLE MED GC
rleGCList <- lapply(scores, function(x) x[,"rleGC_med"])
tmmrleGC <- unlist(lapply(rleGCList, function(x) x[grep(x=names(x), pattern="tmm")]))
df <- data.frame(rleGC = -tmmrleGC,
                 dataset = names(scores),
                 peakSize = "variable")
df$peakSize <- as.character(df$peakSize)
df$peakSize[df$dataset %in% c("bryois2018", "murphy2019")] <- "constant"
df$peakSize <- factor(df$peakSize)
df$medianWidth <- c(301, 492, 385, 980, 485, 201, 458, 500)

pRLEGC <- ggplot(df, aes(x=peakSize, y=rleGC, col=dataset)) +
  geom_point() +
  theme_classic() +
  ggtitle("RLE_MED GC") +
  xlab("Peak width") +
  ylab("RLE value")

pRLEGCWidth <- ggplot(df, aes(x=medianWidth, y=rleGC, col=dataset)) +
  geom_point() +
  theme_classic() +
  ggtitle("RLE_MED GC") +
  xlab("median peak width") +
  ylab("RLE value")

cowplot::plot_grid(pHelUnif, pHelDA, pRLEGC, 
                   nrow=1, labels=letters[1:3],
                   rel_widths = c(0.6, 0.6, 1))

ggsave("~/Dropbox/research/atacseq/bulk/plots/GCPeakWidth.pdf",
       height = 6, width = 11)

cowplot::plot_grid(pHelUnifWidth, pHelDAWidth, pRLEGCWidth, 
                   nrow=1, labels=letters[1:3],
                   rel_widths = c(0.6, 0.6, 1))

ggsave("~/Dropbox/research/atacseq/bulk/plots/GCPeakmedianWidth.pdf",
       height = 6, width = 11)